Advanced Data Visualization Theory DA-1 Prashanth.S 19MID0020
Data-sets used data-set1 : https://raw.githubusercontent.com/ScienceParkStudyGroup/r-lesson-based-on-ohi-data-training/gh-pages/data/ca.csv
data-set 2: https://www.kaggle.com/code/adhok93/zomato-eda-in-r/data?select=zomato.csv
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.8
## ✓ tidyr 1.2.0 ✓ stringr 1.4.0
## ✓ readr 2.1.2 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
require(devtools)
## Loading required package: devtools
## Loading required package: usethis
# National Parks in California
# Load the California national-park visitor records directly from GitHub.
ca <- read_csv(
  file = "https://raw.githubusercontent.com/ScienceParkStudyGroup/r-lesson-based-on-ohi-data-training/gh-pages/data/ca.csv"
)
## Rows: 789 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): region, state, code, park_name, type
## dbl (2): visitors, year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows, then list the column names.
head(ca, n = 6L)
names(ca)
## [1] "region" "state" "code" "park_name" "type" "visitors"
## [7] "year"
# Print the compact structure of `ca` (dimensions, column types, leading values).
str(ca)
## spec_tbl_df [789 × 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ region : chr [1:789] "PW" "PW" "PW" "PW" ...
## $ state : chr [1:789] "CA" "CA" "CA" "CA" ...
## $ code : chr [1:789] "CHIS" "CHIS" "CHIS" "CHIS" ...
## $ park_name: chr [1:789] "Channel Islands National Park" "Channel Islands National Park" "Channel Islands National Park" "Channel Islands National Park" ...
## $ type : chr [1:789] "National Park" "National Park" "National Park" "National Park" ...
## $ visitors : num [1:789] 1200 1500 1600 300 15700 ...
## $ year : num [1:789] 1963 1964 1965 1966 1967 ...
## - attr(*, "spec")=
## .. cols(
## .. region = col_character(),
## .. state = col_character(),
## .. code = col_character(),
## .. park_name = col_character(),
## .. type = col_character(),
## .. visitors = col_double(),
## .. year = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Lookup table: four-letter park code -> full park name.
# Used as the axis labels of the park bar chart.
labs <- c(
  CHIS = "Channel Islands National Park",
  DEVA = "Death Valley National Park",
  JOTR = "Joshua Tree National Park",
  KICA = "Kings Canyon National Park",
  LAVO = "Lassen Volcanic National Park",
  YOSE = "Yosemite National Park",
  SEQU = "Sequoia National Park",
  REDW = "Redwood National Park",
  PINN = "Pinnacles National Park"
)
# Bar chart of each park's share of the rows in `ca`.
# after_stat() derives the proportion from the counts computed by geom_bar().
# Every component must be chained with `+`: without it the percent scale is
# created as a separate object and never becomes part of the plot.
bar_plot <- ggplot(
  data = ca,
  aes(x = code, y = after_stat(count / sum(count)), fill = factor(code))
) +
  geom_bar(color = "black") +
  labs(
    x = "National Parks",
    y = "Percentage of National Parks in the data-set",
    title = "Occurrence of the National Parks in the data-set"
  ) +
  scale_x_discrete(labels = labs) +
  scale_y_continuous(labels = scales::percent)

# Swap the axes so the park names run along the vertical axis.
bar_plot + coord_flip()
Yosemite National Park and Sequoia National Park occur more often in the data-set than the other parks.
# Kernel density estimate of the visitor counts on a log10 scale.
# The x-axis label states the transformation so the axis values are not
# mistaken for raw visitor counts.
density_plot <- ggplot(data = ca, aes(x = log10(visitors))) +
  geom_density(fill = "indianred3") +
  labs(
    x = "log10(Number of visitors)",
    y = "density",
    title = "Kernel density of Visitors"
  )
density_plot
## Plot-2 –> plotly ### Scatter plot
# Font specifications. None of them is referenced by the code visible in this
# document; they are kept in case later code relies on them.
t <- list(family = "Helvetica", size = 14, color = "blue")
t1 <- list(family = "Times New Roman", color = "red")
t2 <- list(family = "Courier New", size = 14, color = "green")
t3 <- list(family = "Arial")

# Interactive scatter plot of yearly visitors, one colour per park.
# `colors = "Paired"` selects a ColorBrewer palette with 12 colours; the
# default palette (Set2) only provides 8, which triggers a brewer.pal warning
# when there are more parks than that.
scatter_plot <- plot_ly(
  data = ca,
  x = ~year,
  y = ~visitors,
  color = ~park_name,
  colors = "Paired",
  type = "scatter",
  mode = "markers"
) %>%
  layout(
    title = list(text = "<b>Year and Visitors"),
    legend = list(title = list(text = "<b>National Parks")),
    xaxis = list(title = list(text = "<b>Year")),
    yaxis = list(title = list(text = "<b>Visitors")),
    plot_bgcolor = "#e5ecf6"
  )
scatter_plot
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
Yosemite National Park receives more visitors than any other national park.
# Grouped bar chart of every attribute of `ca`, per park.
#
# The data is reshaped to long format (one row per park_name/attribute pair)
# with tidyr::pivot_longer(). This replaces data.table::melt(), which only
# handled a tibble through a deprecated redirect to reshape2 that is announced
# to become an error.
#
# The measured columns mix character and numeric types, so every value is cast
# to character (melt did this implicitly). `variable` is turned into a factor
# in original column order and the rows are sorted by it, matching the layout
# that melt produced.
r_group_barchart <- ca %>%
  pivot_longer(
    cols = -park_name,
    names_to = "variable",
    values_to = "value",
    values_transform = list(value = as.character)
  ) %>%
  mutate(variable = factor(variable, levels = unique(variable))) %>%
  arrange(variable) %>%
  plot_ly(
    x = ~park_name,
    y = ~value,
    type = "bar",
    name = ~variable,
    color = ~variable
  ) %>%
  layout(
    title = list(text = "<b>Total Distribution based on the Data-Set"),
    legend = list(title = list(text = "<b>Attribute")),
    xaxis = list(title = list(text = "<b>Parks")),
    # The axis title text belongs inside `title`, like the other axes.
    yaxis = list(title = list(text = "<b>Count")),
    barmode = "group"
  )
r_group_barchart
Kings Canyon National Park ranks at the top in terms of region, state and national-park type.
# Number of rows in `ca` for each park name (columns: Var1 = name, Freq = count).
df_order <- data.frame(table(ca$park_name))
df_order

# Pie chart of each park's share of the rows. Title and legend name what is
# actually plotted (park names), not the "Order" of an unrelated data-set.
pie_chart <- plot_ly(
  type = "pie",
  labels = df_order$Var1,
  values = df_order$Freq,
  textinfo = "label+percent",
  insidetextorientation = "radial"
) %>%
  layout(
    title = list(text = "<b>Share of Records per National Park"),
    legend = list(title = list(text = "<b>National Park"))
  )
pie_chart
# Histogram of the natural log of the visitor counts, named by park code.
# Title, legend and x-axis label describe the park data being plotted; the
# previous texts referred to an unrelated animal-sleep data-set and labelled
# the log-transformed axis as raw visitors.
histogram_plot <- plot_ly(
  data = ca,
  x = ~log(visitors),
  name = ~code,
  type = "histogram"
) %>%
  layout(
    title = list(text = "<b>Distribution of log(Visitors) by Park Code"),
    legend = list(title = list(text = "<b>Park Code")),
    xaxis = list(title = list(text = "<b>log(Visitors)")),
    yaxis = list(title = list(text = "<b>Count"))
  )
histogram_plot
The SEQU and LAVO national parks have a higher number of visitors.
# Number of rows in `ca` for each park code (columns: Var1 = code, Freq = count).
df_vore <- data.frame(table(ca$code))
df_vore

# Donut chart (pie with a 0.6 hole) of each park code's share of the rows.
# Title and legend name the plotted variable (park code) instead of "Order".
donut_chart <- plot_ly(
  labels = df_vore$Var1,
  values = df_vore$Freq,
  textinfo = "label+percent"
) %>%
  add_pie(hole = 0.6) %>%
  layout(
    title = list(text = "<b>Share of Records per Park Code"),
    legend = list(title = list(text = "<b>Park Code"))
  )
donut_chart
With respect to the pie chart, the SEQU, YOSE and KICA national parks have a higher number of visitors.
library(dplyr)
# Read the Zomato restaurant data from the working directory,
# then preview the first six rows and list the column names.
df <- read.csv(file = "zomato.csv")
head(df, n = 6L)
colnames(df)
## [1] "Restaurant.ID" "Restaurant.Name" "Country.Code"
## [4] "City" "Address" "Locality"
## [7] "Locality.Verbose" "Longitude" "Latitude"
## [10] "Cuisines" "Average.Cost.for.two" "Currency"
## [13] "Has.Table.booking" "Has.Online.delivery" "Is.delivering.now"
## [16] "Switch.to.order.menu" "Price.range" "Aggregate.rating"
## [19] "Rating.color" "Rating.text" "Votes"
# Print the compact structure of `df` (dimensions, column types, leading values).
str(df)
## 'data.frame': 9551 obs. of 21 variables:
## $ Restaurant.ID : int 6317637 6304287 6300002 6318506 6314302 18189371 6300781 6301290 6300010 6314987 ...
## $ Restaurant.Name : chr "Le Petit Souffle" "Izakaya Kikufuji" "Heat - Edsa Shangri-La" "Ooma" ...
## $ Country.Code : int 162 162 162 162 162 162 162 162 162 162 ...
## $ City : chr "Makati City" "Makati City" "Mandaluyong City" "Mandaluyong City" ...
## $ Address : chr "Third Floor, Century City Mall, Kalayaan Avenue, Poblacion, Makati City" "Little Tokyo, 2277 Chino Roces Avenue, Legaspi Village, Makati City" "Edsa Shangri-La, 1 Garden Way, Ortigas, Mandaluyong City" "Third Floor, Mega Fashion Hall, SM Megamall, Ortigas, Mandaluyong City" ...
## $ Locality : chr "Century City Mall, Poblacion, Makati City" "Little Tokyo, Legaspi Village, Makati City" "Edsa Shangri-La, Ortigas, Mandaluyong City" "SM Megamall, Ortigas, Mandaluyong City" ...
## $ Locality.Verbose : chr "Century City Mall, Poblacion, Makati City, Makati City" "Little Tokyo, Legaspi Village, Makati City, Makati City" "Edsa Shangri-La, Ortigas, Mandaluyong City, Mandaluyong City" "SM Megamall, Ortigas, Mandaluyong City, Mandaluyong City" ...
## $ Longitude : num 121 121 121 121 121 ...
## $ Latitude : num 14.6 14.6 14.6 14.6 14.6 ...
## $ Cuisines : chr "French, Japanese, Desserts" "Japanese" "Seafood, Asian, Filipino, Indian" "Japanese, Sushi" ...
## $ Average.Cost.for.two: int 1100 1200 4000 1500 1500 1000 2000 2000 6000 1100 ...
## $ Currency : chr "Botswana Pula(P)" "Botswana Pula(P)" "Botswana Pula(P)" "Botswana Pula(P)" ...
## $ Has.Table.booking : chr "Yes" "Yes" "Yes" "No" ...
## $ Has.Online.delivery : chr "No" "No" "No" "No" ...
## $ Is.delivering.now : chr "No" "No" "No" "No" ...
## $ Switch.to.order.menu: chr "No" "No" "No" "No" ...
## $ Price.range : int 3 3 4 4 4 3 4 4 4 3 ...
## $ Aggregate.rating : num 4.8 4.5 4.4 4.9 4.8 4.4 4 4.2 4.9 4.8 ...
## $ Rating.color : chr "Dark Green" "Dark Green" "Green" "Dark Green" ...
## $ Rating.text : chr "Excellent" "Excellent" "Very Good" "Excellent" ...
## $ Votes : int 314 591 270 365 229 336 520 677 621 532 ...
# Bar chart of each rating colour's share of the rows in `df`.
# after_stat() derives the proportion from the counts computed by geom_bar().
# The labels describe the Zomato rating colours being plotted rather than the
# national-park data used earlier in the document.
bar_plot <- ggplot(
  data = df,
  aes(
    x = Rating.color,
    y = after_stat(count / sum(count)),
    fill = factor(Rating.color)
  )
) +
  geom_bar(color = "black") +
  labs(
    x = "Rating color",
    y = "Percentage of restaurants in the data-set",
    title = "Occurrence of the rating colors in the data-set"
  ) +
  scale_y_continuous(labels = scales::percent)

# Swap the axes so the rating colours run along the vertical axis.
bar_plot + coord_flip()
# Count the distinct restaurant IDs among rows with Country.Code 1.
df %>%
  filter(Country.Code == 1) %>%
  distinct(Restaurant.ID) %>%
  nrow()
## [1] 8652
There are 8652 restaurants operating in India
# Horizontal bar chart: number of distinct restaurants per city for
# Country.Code 1, with the cities ordered by that number.
df %>%
  filter(Country.Code == 1) %>%
  distinct(Restaurant.ID, City) %>%
  count(City, name = "n") %>%
  ggplot(aes(x = reorder(City, n), y = n)) +
  geom_col(fill = "#cb202d") +
  coord_flip() +
  labs(
    x = "City",
    y = "Number of Restaurants",
    title = "Number of Restaurants by City"
  ) +
  theme(
    panel.background = element_blank(),
    strip.background = element_blank(),
    axis.title = element_text(color = "#2d2d2d"),
    strip.text.x = element_text(color = "#2d2d2d", face = "bold", size = 10),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 15)
  )
More restaurants operate with Zomato in Delhi than in any other city.
# Restaurants in Agra (Country.Code 1 is the code the document treats as India).
df_india <- df %>%
  filter(Country.Code == 1, City == "Agra")
head(df_india)

# Map of the Agra restaurants, one point per restaurant.
# - Each row is a single location, so points are drawn; a polygon would join
#   unrelated restaurants into one meaningless shape.
# - `color = "white"` is set on the layer (point outline); passed to ggplot()
#   it had no effect.
# - `fill` is mapped to Rating.text so that scale_fill_viridis_d() has an
#   aesthetic to act on. NOTE(review): Rating.text is an assumed choice of
#   fill variable - confirm the intended one.
ggplot(
  data = df_india,
  aes(x = Longitude, y = Latitude, fill = Rating.text)
) +
  geom_point(shape = 21, color = "white", size = 3) +
  scale_fill_viridis_d(option = "A") +
  labs(fill = "Rating", title = "Zomato Restaurants in Agra")
library(gganimate)
library(gifski)
# Animated scatter plot of visitors per year that steps through the parks,
# one park_name state at a time, rendered to a GIF with gifski.
scatter_plot_animate <- ggplot(data = ca, mapping = aes(x = year, y = visitors)) +
  geom_point() +
  transition_states(park_name)
animate(scatter_plot_animate, renderer = gifski_renderer())
## Keep only the rows whose park code is 'CHIS', 'DEVA' or 'JOTR'
d <- filter(ca, code %in% c('CHIS', 'DEVA', 'JOTR'))
d
# Animated line chart of visitors per year for the parks in `d`, one line per
# park code, revealed progressively along the year axis and rendered as a
# 300 x 300 GIF.
line_plot <- ggplot(
  data = d,
  mapping = aes(x = year, y = visitors, group = code, color = code)
) +
  geom_line() +
  geom_point() +
  transition_reveal(year)
animate(line_plot, width = 300, height = 300, renderer = gifski_renderer())
Visitors started coming to Channel Islands National Park in the 1960s (the latest among the three parks), and it has had very few visitors to date. Joshua Tree National Park shows continuous growth in visitors without a big drop. Death Valley National Park shows continuous growth in visitors in the 2010s and a gradual pick-up.